Compare vs PNAS

Code is partially based on /home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/clean_previous_hits.R.

Finding regions of interest

First I find the regions of the genome corresponding to the genes of interest.

## Track time spent on making the report
startTime <- Sys.time()

## Required pkgs
library("GenomicRanges")
## Loading required package: methods
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library("ggbio")
## Loading required package: ggplot2
## Need specific help about ggbio? try mailing 
##  the maintainer or visit http://tengfei.github.com/ggbio/
## 
## Attaching package: 'ggbio'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     geom_bar, geom_rect, geom_segment, ggsave, stat_bin,
##     stat_identity, xlim
library("reshape2")
library("plyr")
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:IRanges':
## 
##     desc
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
library("scales")
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:ggbio':
## 
##     rescale
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species
library("org.Hs.eg.db")
## Loading required package: DBI
## Gene symbol names of interest
#### Original names
## symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "PAPD1", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "FAM123A", "HIVEP3", "HNRPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "CENTB5", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")

## http://www.genenames.org/data/hgnc_data.php?hgnc_id=25532
## http://www.genenames.org/data/hgnc_data.php?hgnc_id=26360
## http://www.genenames.org/data/hgnc_data.php?hgnc_id=5041
## http://www.genenames.org/data/hgnc_data.php?hgnc_id=16754

## Updated symbols
symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "MTPAP", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "AMER2", "HIVEP3", "HNRNPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "ACAP3", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")


## Map gene symbol names to entrezid's

keys <- keys(org.Hs.eg.db, keytype = "ENTREZID")
columns <- c("SYMBOL")
map <- select(org.Hs.eg.db, keys, columns, keytype = "ENTREZID")
idx <- sapply(symbols, function(x) { 
    res <- which(map$SYMBOL == x)
    ifelse(length(res) > 0, res, NA)
})
ids <- map$ENTREZID[idx]
names(ids) <- names(idx)

## Remove those not-found
ids <- ids[!is.na(ids)]

## Find the exons

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
exonsUCSC <- exons(txdb, vals = list(gene_id =  ids), columns = c("gene_id", "exon_id", "tx_name", "tx_id"))

## some gene ids have multiples, straighten out
gids <- as.list(exonsUCSC$gene_id)
for(i in seq(along=gids)) gids[[i]] <- gids[[i]][which(gids[[i]] %in% ids)]
gids <- unlist(gids)


## split into list by EID
exonListUCSC <- split(exonsUCSC, gids)
exonListUCSC <- exonListUCSC[ ids[ids %in% gids] ]
### drop duplicated exons
# exonListUCSC = lapply(exonListUCSC, function(x) x[!duplicated(x)])
# identical(names(exonListUCSC), ids[ids %in% gids]) # TRUE

## Not found
ids[which(!ids %in% gids)]
##    RN7SK SNORA73A 
## "125050"   "6080"
## Find them manually
# http://www.ncbi.nlm.nih.gov/gene/125050
# http://www.ncbi.nlm.nih.gov/gene/6080
missing <- GRanges(seqnames=c("chr6", "chr1"), ranges=IRanges(start=c(52860418, 28833877), end=c(52860749, 28834083)))
toAdd <- split(missing, 1:2)
names(toAdd) <- ids[which(!ids %in% gids)]

## Reduce to min/max per gene
windows <- c(GRangesList(lapply(exonListUCSC, range)), toAdd)
## 
## Attaching package: 'XVector'
## 
## The following object is masked from 'package:plyr':
## 
##     compact
## Save for later use
save(windows, ids, idx, file="windows.Rdata")

Plots

Now, lets make the plots.

## Find chrs used
chrs <- as.character(unique(unlist(seqnames(windows), use.names=FALSE)))

## Build ideograms
data(hg19IdeogramCyto, package = "biovizBase")
p.ideos <- lapply(chrs, function(xx) { 
    plotIdeogram(hg19IdeogramCyto, xx)
})
names(p.ideos) <- chrs

## load data needed
load("../CoverageInfo/fullCov.Rdata")
load("../derAnalysis/run3-v1.0.10/fullRegions.Rdata")
load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata")
load("../derAnalysis/run3-v1.0.10/colsubset.Rdata")

## Filter data
fullCovSmall <- lapply(chrs, function(chr) {
    fullCov[[chr]][, colsubset]
})
names(fullCovSmall) <- chrs
rm(fullCov)

## Main plotting function
plotClusterCustom <- function(cluster, regions, titleName, coverageInfo, groupInfo, titleUse="fwer", txdb=NULL, p.ideogram=NULL, maxExtend=300L, colsubset=NULL, forceLarge=FALSE) {

    stopifnot(is.factor(groupInfo))
    if(is.null(colsubset)) colsubset <- seq_len(length(groupInfo))
    
    ## Window length
    l <-  width(cluster) + 2 * min(maxExtend, width(cluster))
    
    if(l > 1e5 & !forceLarge) {
        message(paste("No plot will be made because the data is too large. The window size exceeds 100 kb."))
        return(invisible(l))
    }
    
    wh <- resize(cluster, l, fix="center")
    title <- paste("Window view for ENTREZ Symbol", titleName)
    
    ## Plot the ideogram if not supplied
    if(is.null(p.ideogram)) {
        chr <- as.character(seqnames(wh))
        ## Now load the ideogram info
        hg19IdeogramCyto <- NULL
        load(system.file("data", "hg19IdeogramCyto.rda", package="biovizBase", mustWork=TRUE))
        p.ideogram <- plotIdeogram(hg19IdeogramCyto, chr)
    }
    
    ## Regions found (from the view)
    neighbors <- regions[queryHits(findOverlaps(regions, wh))]
    if(length(neighbors) == 0) {
        neighbors <- wh
        neighbors$significant <- NA
        neighbors$significantQval <- NA
        neighbors$significantFWER <- NA
    } 
    if(titleUse == "pval") {
        p.region <- autoplot(neighbors, aes(fill=significant)) + 
        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
    } else if (titleUse == "qval" ){
        p.region <- autoplot(neighbors, aes(fill=significantQval)) +
        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
    } else if (titleUse == "fwer" ){
        p.region <- autoplot(neighbors, aes(fill=significantFWER)) +
        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
    } else {
        p.region <- autoplot(neighbors)
    }

    ## Graphical parameters
    nGroups <- length(levels(groupInfo))
    
    ## Construct the coverage plot
    pos <- start(wh):end(wh)
    rawData <- as.data.frame(coverageInfo[pos, colsubset])
    rawData$position <- pos
    covData <- melt(rawData, id.vars="position")
    covData$group <- rep(groupInfo, each=nrow(rawData))
    p.coverage <- ggplot(covData, aes(x=position, y=value, group=variable, colour=group)) + geom_line(alpha=1/nGroups) + scale_y_continuous(trans=log2_trans())
    
    ## Construct mean by group coverage plot
    meanCoverage <- ddply(covData, c("position", "group"), summarise, meanCov=mean(value))
    p.meanCov <- ggplot(meanCoverage, aes(x=position, y=meanCov, colour=group)) + geom_line(alpha=1/max(1, 1/2 * nGroups)) + scale_y_continuous(trans=log2_trans())
    
    ## Annotation info and final plot
    if(is.null(txdb)) {
        p.transcripts <- FALSE
    } else {
        ## The tryCatch is needed because not all regions overlap a transcript
        p.transcripts <- tryCatch(autoplot(txdb, which = wh, names.expr = "tx_name(gene_id)"), error = function(e) { FALSE })
    }   
    if(!is.logical(p.transcripts)) {
        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, "tx_name\n(gene_id)" = p.transcripts, heights = c(2, 4, 4, 1.5, 3), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()       
    } else {
        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, heights = c(2, 5, 5, 2), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()
    }
    return(result)  
}

## Plotting function
regionClusterPlot <- function(i, tUse="fwer") {
    ## Chr specific selections
    chr <- as.character(seqnames(windows[[i]]))
    p.ideo <- p.ideos[[chr]]
    covInfo <- fullCovSmall[[chr]]
    
    ## Make the plot
    p <- plotClusterCustom(windows[[i]], regions=fullRegions, titleName=names(ids)[ids == names(windows)[i]], coverageInfo=covInfo, groupInfo=groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo, forceLarge=TRUE)
    print(p)
    rm(p.ideo, covInfo)
    
    return(invisible(TRUE)) 
}

## Make plots
for(i in seq_len(length(windows))) {
    regionClusterPlot(i)
}

Reproducibility

Date the report was generated.

## [1] "2014-12-17 18:50:11 EST"

Wallclock time spent generating the report.

## Time difference of 2.206 hours

R session information.

## Session info-----------------------------------------------------------------------------------------------------------
##  setting  value                                      
##  version  R version 3.1.1 Patched (2014-10-16 r66782)
##  system   x86_64, linux-gnu                          
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  tz       
## Packages---------------------------------------------------------------------------------------------------------------
##  package                           * version  date       source                                   
##  acepack                             1.3.3.3  2013-05-03 CRAN (R 3.1.1)                           
##  AnnotationDbi                     * 1.28.1   2014-10-29 Bioconductor                             
##  base64enc                           0.1.2    2014-06-26 CRAN (R 3.1.0)                           
##  BatchJobs                           1.5      2014-10-30 CRAN (R 3.1.1)                           
##  BBmisc                              1.8      2014-10-30 CRAN (R 3.1.1)                           
##  Biobase                           * 2.26.0   2014-10-15 Bioconductor                             
##  BiocGenerics                      * 0.12.1   2014-11-15 Bioconductor                             
##  BiocParallel                        1.0.0    2014-10-15 Bioconductor                             
##  biomaRt                             2.22.0   2014-10-15 Bioconductor                             
##  Biostrings                          2.34.1   2014-12-13 Bioconductor                             
##  biovizBase                          1.14.1   2014-12-14 Bioconductor                             
##  bitops                              1.0.6    2013-08-17 CRAN (R 3.1.0)                           
##  brew                                1.0.6    2011-04-13 CRAN (R 3.1.0)                           
##  BSgenome                            1.34.0   2014-10-15 Bioconductor                             
##  Cairo                               1.5.6    2014-06-26 CRAN (R 3.1.0)                           
##  checkmate                           1.5.1    2014-12-14 CRAN (R 3.1.1)                           
##  cluster                             1.15.3   2014-09-04 CRAN (R 3.1.1)                           
##  codetools                           0.2.9    2014-08-21 CRAN (R 3.1.1)                           
##  colorspace                          1.2.4    2013-09-30 CRAN (R 3.1.0)                           
##  DBI                               * 0.3.1    2014-09-24 CRAN (R 3.1.1)                           
##  devtools                            1.6.1    2014-10-07 CRAN (R 3.1.1)                           
##  dichromat                           2.0.0    2013-01-24 CRAN (R 3.1.0)                           
##  digest                              0.6.6    2014-12-10 CRAN (R 3.1.1)                           
##  evaluate                            0.5.5    2014-04-29 CRAN (R 3.1.0)                           
##  fail                                1.2      2013-09-19 CRAN (R 3.1.0)                           
##  foreach                             1.4.2    2014-04-11 CRAN (R 3.1.0)                           
##  foreign                             0.8.61   2014-03-28 CRAN (R 3.1.1)                           
##  formatR                             1.0      2014-08-25 CRAN (R 3.1.1)                           
##  Formula                             1.1.2    2014-07-13 CRAN (R 3.1.1)                           
##  GenomeInfoDb                      * 1.2.3    2014-11-15 Bioconductor                             
##  GenomicAlignments                   1.2.1    2014-11-05 Bioconductor                             
##  GenomicFeatures                   * 1.18.3   2014-12-17 Bioconductor                             
##  GenomicRanges                     * 1.18.3   2014-11-20 Bioconductor                             
##  GGally                              0.5.0    2014-12-02 CRAN (R 3.1.1)                           
##  ggbio                             * 1.14.0   2014-11-04 Bioconductor                             
##  ggplot2                           * 1.0.0    2014-05-21 CRAN (R 3.1.0)                           
##  graph                               1.44.1   2014-12-10 Bioconductor                             
##  gridExtra                           0.9.1    2012-08-09 CRAN (R 3.1.1)                           
##  gtable                              0.1.2    2012-12-05 CRAN (R 3.1.0)                           
##  Hmisc                               3.14.6   2014-11-22 CRAN (R 3.1.1)                           
##  htmltools                           0.2.6    2014-09-08 CRAN (R 3.1.1)                           
##  IRanges                           * 2.0.1    2014-12-13 Bioconductor                             
##  iterators                           1.0.7    2014-04-11 CRAN (R 3.1.0)                           
##  knitr                               1.8      2014-11-11 CRAN (R 3.1.1)                           
##  knitrBootstrap                      1.0.0    2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)
##  labeling                            0.3      2014-08-23 CRAN (R 3.1.1)                           
##  lattice                             0.20.29  2014-04-04 CRAN (R 3.1.1)                           
##  latticeExtra                        0.6.26   2013-08-15 CRAN (R 3.1.0)                           
##  markdown                            0.7.4    2014-08-24 CRAN (R 3.1.1)                           
##  MASS                                7.3.35   2014-09-30 CRAN (R 3.1.1)                           
##  mime                                0.2      2014-09-26 CRAN (R 3.1.1)                           
##  munsell                             0.4.2    2013-07-11 CRAN (R 3.1.0)                           
##  nnet                                7.3.8    2014-03-28 CRAN (R 3.1.1)                           
##  OrganismDbi                         1.8.0    2014-10-15 Bioconductor                             
##  org.Hs.eg.db                      * 3.0.0    2014-09-27 Bioconductor                             
##  plyr                              * 1.8.1    2014-02-26 CRAN (R 3.1.0)                           
##  proto                               0.3.10   2012-12-22 CRAN (R 3.1.0)                           
##  RBGL                                1.42.0   2014-10-15 Bioconductor                             
##  RColorBrewer                        1.1.2    2014-12-07 CRAN (R 3.1.1)                           
##  Rcpp                                0.11.3   2014-09-29 CRAN (R 3.1.1)                           
##  RCurl                               1.95.4.5 2014-12-06 CRAN (R 3.1.1)                           
##  reshape                             0.8.5    2014-04-23 CRAN (R 3.1.0)                           
##  reshape2                          * 1.4.1    2014-12-06 CRAN (R 3.1.1)                           
##  rmarkdown                         * 0.3.3    2014-09-17 CRAN (R 3.1.1)                           
##  rpart                               4.1.8    2014-03-28 CRAN (R 3.1.1)                           
##  Rsamtools                           1.18.2   2014-11-12 Bioconductor                             
##  RSQLite                           * 1.0.0    2014-10-25 CRAN (R 3.1.1)                           
##  rstudioapi                          0.1      2014-03-27 CRAN (R 3.1.1)                           
##  rtracklayer                         1.26.2   2014-11-12 Bioconductor                             
##  S4Vectors                         * 0.4.0    2014-10-15 Bioconductor                             
##  scales                            * 0.2.4    2014-04-22 CRAN (R 3.1.0)                           
##  sendmailR                           1.2.1    2014-09-21 CRAN (R 3.1.1)                           
##  stringr                             0.6.2    2012-12-06 CRAN (R 3.1.0)                           
##  survival                            2.37.7   2014-01-22 CRAN (R 3.1.1)                           
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0    2014-09-29 Bioconductor                             
##  VariantAnnotation                   1.12.7   2014-12-13 Bioconductor                             
##  XML                                 3.98.1.1 2013-06-20 CRAN (R 3.1.0)                           
##  XVector                           * 0.6.0    2014-10-15 Bioconductor                             
##  yaml                                2.1.13   2014-06-12 CRAN (R 3.1.1)                           
##  zlibbioc                            1.12.0   2014-10-15 Bioconductor